---
title: "Final Project - Instagram Data"
author: "Estrella Hurtado & Alina Valliani"
date: "2024-04-04"
output: html_document
---

## Introduction

Where we got the data. What categories we used (follower buckets, caption lengths, keyword buckets).
# Global chunk options: keep warnings and messages out of the knitted report.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE
)
library(tidyverse)
library(lubridate)
library(stringr)
library(dplyr)
library(plotly)
# Read the Instagram export from the working directory and preview its
# columns and types.
insta_data <- read_csv(file = "instagram_data.csv")
insta_data %>% glimpse()
## Rows: 11,692
## Columns: 14
## $ owner_id <chr> "36063641", "36063641", "36063641", "36063641", "36063…
## $ owner_username <chr> "christendominique", "christendominique", "christendom…
## $ shortcode <chr> "C3_GS1ASeWI", "C38ivgNS3IX", "C35-Dd9SO1b", "C33TadDM…
## $ is_video <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
## $ caption <chr> "I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @ta3 X …
## $ comments <dbl> 268, 138, 1089, 271, 145, 143, 356, 132, 128, 884, 211…
## $ likes <dbl> 16382, 9267, 10100, 6943, 17158, 9683, 42906, 4287, 74…
## $ created_at <dbl> 1709326758, 1709241048, 1709154707, 1709065322, 170871…
## $ location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ imageUrl <chr> "https://instagram.flba2-1.fna.fbcdn.net/v/t39.30808-6…
## $ multiple_images <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ username <chr> "christendominique", "christendominique", "christendom…
## $ followers <dbl> 2144626, 2144626, 2144626, 2144626, 2144626, 2144626, …
## $ following <dbl> 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, …
Created new columns with calculated values
# Derive the per-post metrics used by every later chunk:
#   engagement          (likes + comments) as a percentage of followers, 2 dp
#   follower_quantile   quartile (1-4) of the follower count, via ntile()
#   engagement_quantile quartile (1-4) of engagement, via ntile()
#   post_timestamp      created_at (epoch seconds) converted to a date-time
#   post_time           post_timestamp rounded to the nearest hour, as "HH:MM"
#   caption_length      number of pieces when the caption is split on single
#                       spaces (a rough word count)
# NOTE(review): ntile() forces equal-sized buckets, so rows with identical
# follower counts can land in different quartiles; and an NA caption still
# yields a caption_length of 1 because strsplit() returns a single NA.
new_data <- mutate(
  insta_data,
  engagement = round((likes + comments) / followers * 100, digits = 2),
  follower_quantile = ntile(followers, 4),
  engagement_quantile = ntile(engagement, 4),
  post_timestamp = as_datetime(created_at),
  post_time = format(
    round(post_timestamp, units = "hours"),
    format = "%H:%M"
  ),
  caption_length = lengths(strsplit(caption, " "))
)
new_data %>% glimpse()
## Rows: 11,692
## Columns: 20
## $ owner_id <chr> "36063641", "36063641", "36063641", "36063641", "3…
## $ owner_username <chr> "christendominique", "christendominique", "christe…
## $ shortcode <chr> "C3_GS1ASeWI", "C38ivgNS3IX", "C35-Dd9SO1b", "C33T…
## $ is_video <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
## $ caption <chr> "I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @ta…
## $ comments <dbl> 268, 138, 1089, 271, 145, 143, 356, 132, 128, 884,…
## $ likes <dbl> 16382, 9267, 10100, 6943, 17158, 9683, 42906, 4287…
## $ created_at <dbl> 1709326758, 1709241048, 1709154707, 1709065322, 17…
## $ location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ imageUrl <chr> "https://instagram.flba2-1.fna.fbcdn.net/v/t39.308…
## $ multiple_images <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ username <chr> "christendominique", "christendominique", "christe…
## $ followers <dbl> 2144626, 2144626, 2144626, 2144626, 2144626, 21446…
## $ following <dbl> 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, 10…
## $ engagement <dbl> 0.78, 0.44, 0.52, 0.34, 0.81, 0.46, 2.02, 0.21, 0.…
## $ follower_quantile <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1,…
## $ engagement_quantile <int> 3, 2, 3, 2, 3, 2, 4, 2, 2, 4, 2, 2, 1, 3, 4, 2, 3,…
## $ post_timestamp <dttm> 2024-03-01 20:59:18, 2024-02-29 21:10:48, 2024-02…
## $ post_time <chr> "21:00", "21:00", "21:00", "20:00", "20:00", "20:0…
## $ caption_length <int> 12, 34, 81, 57, 17, 66, 50, 17, 8, 53, 17, 20, 90,…
Insights on the account follower distribution: quartile 1 is the lowest follower count, quartile 4 the highest.
# Mean follower count inside each follower quartile, rounded to a whole number
# and formatted with a thousands separator (so the result column is character).
new_data %>%
  group_by(follower_quantile) %>%
  summarise(
    follower_mean = format(
      round(mean(followers), 0),
      big.mark = ","
    )
  )
## # A tibble: 5 × 2
## follower_quantile follower_mean
## <int> <chr>
## 1 1 108,262
## 2 2 342,149
## 3 3 834,535
## 4 4 8,559,178
## 5 NA NA
Engagement vs. follower count bucket
# Line chart of average engagement per follower-count quartile.
# Rows are dropped only when engagement is missing. Comparing with
# `engagement != is.na(engagement)` would compare against FALSE (0) and
# therefore also discard every post whose engagement is exactly 0, inflating
# the averages.
new_data %>%
  filter(!is.na(engagement)) %>%
  group_by(follower_quantile) %>%
  summarise(avg_eng = mean(engagement)) %>%
  ggplot(aes(x = follower_quantile, y = avg_eng)) +
  geom_line() +
  labs(x = "Follower Count Quartile", y = "Average Engagement")
# Mean engagement and number of posts for each rounded posting hour.
# Only rows with missing engagement are removed; `!is.na()` keeps posts whose
# engagement is exactly 0, which `engagement != is.na(engagement)` would drop.
new_data %>%
  filter(!is.na(engagement)) %>%
  group_by(post_time) %>%
  summarise(mean(engagement), n())
## # A tibble: 24 × 3
## post_time `mean(engagement)` `n()`
## <chr> <dbl> <int>
## 1 00:00 2.08 261
## 2 01:00 1.88 267
## 3 02:00 1.71 238
## 4 03:00 2.99 178
## 5 04:00 2.37 138
## 6 05:00 3.34 125
## 7 06:00 2.38 145
## 8 07:00 1.59 185
## 9 08:00 3.81 223
## 10 09:00 1.93 293
## # ℹ 14 more rows
## When do posts get the most engagement?
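We see the most engagement for posts made around 5 AM, 8 AM, 12 PM, 1 PM, 4 PM and 5 PM, i.e. during peak times of the day. The chart shows average engagement % by posting hour. The hours come from `as_datetime(created_at)`, which uses UTC by default, so they are UTC hours rather than each poster's local time.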
# Horizontal bar chart of average engagement % per rounded posting hour,
# rendered interactively with plotly. The hand-picked peak hours are filled
# "tomato"; every other hour falls back to the scale's NA colour.
peak_hours <- c("05:00", "08:00", "12:00", "13:00", "16:00", "17:00")

time_eng <- new_data %>%
  # Drop only missing engagement; zero-engagement posts must stay in the mean.
  filter(!is.na(engagement)) %>%
  group_by(post_time) %>%
  summarise(eng_mean = round(mean(engagement), 1)) %>%
  ggplot(aes(x = eng_mean, y = post_time, fill = as.factor(post_time))) +
  # geom_col() always uses the identity stat, so no `stat` argument is passed.
  geom_col() +
  labs(y = "Posting Time", x = "Avg Engagement %") +
  scale_fill_manual(
    values = setNames(rep("tomato", length(peak_hours)), peak_hours),
    na.value = "grey50",
    # `guide = FALSE` is deprecated; "none" is the supported way to hide it.
    guide = "none"
  )
ggplotly(time_eng)
## Relationship between caption lengths and engagement
Highest engagement posts include captions with lengths x & y
# Average engagement per caption-length bucket, shown as points.
# Buckets are half-open intervals [lower, upper), built with cut() so that
# every length falls into exactly one bucket (a length of exactly 350 belongs
# to "350+").
caption_breaks <- c(-Inf, 50, 100, 150, 200, 250, 300, 350, Inf)
caption_labels <- c(
  "<50", "50-100", "100-150", "150-200",
  "200-250", "250-300", "300-350", "350+"
)

new_data %>%
  # Drop only missing engagement; zero-engagement posts must stay in the mean.
  filter(!is.na(engagement)) %>%
  mutate(
    caption_bucket = cut(
      caption_length,
      breaks = caption_breaks,
      labels = caption_labels,
      right = FALSE
    )
  ) %>%
  group_by(caption_bucket) %>%
  summarise(avg_eng = mean(engagement)) %>%
  ggplot(aes(x = caption_bucket, y = avg_eng)) +
  geom_point(size = 5) +
  labs(x = "Caption Length", y = "Average Engagement") +
  # Fix the axis order and keep empty buckets visible.
  scale_x_discrete(limits = caption_labels)
## Pictures/Videos/Carousel vs. Engagement
Do pictures or videos get more engagement? Clearly single images get more engagement and carousels get less
# Bar chart of average engagement by content type.
# A post is a "Carousel" when it has multiple images, otherwise a "Video" or a
# "Picture" depending on is_video.
new_data %>%
  # Drop only missing engagement; `engagement != is.na(engagement)` would also
  # remove posts whose engagement is exactly 0.
  filter(!is.na(engagement)) %>%
  mutate(
    type = case_when(
      is_video & !multiple_images ~ "Video",
      !is_video & !multiple_images ~ "Picture",
      multiple_images ~ "Carousel"
    )
  ) %>%
  group_by(type) %>%
  summarise(avg_eng = mean(engagement)) %>%
  ggplot(aes(x = type, y = avg_eng)) +
  geom_col() +
  labs(x = "Content Type", y = "Average Engagement")
We see pictures get more engagement
# Scatter plot of likes against comments for every post, both axes on a log10
# scale, coloured by is_video. Labels and colours are matched positionally to
# the is_video values (FALSE first, then TRUE).
ggplot(
  data = new_data,
  mapping = aes(x = likes, y = comments, group = is_video, color = is_video)
) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_manual(
    name = "Type",
    labels = c("Picture", "Video"),
    values = c("blue", "red")
  )